started: Alexey Larionov, 2016
last updated: Alexey Larionov, 09Feb2017
Overall, eigenvectors are calculated for 3 datasets:
This script deals with wecare-nfe dataset
# Time stamp
Sys.time()
## [1] "2017-02-21 10:17:17 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"
# Required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
load(paste(interim_data_folder, "r04_filter_cases_jan2017.RData", sep="/"))
ls()
## [1] "interim_data_folder" "wecare_nfe_exac.df"
## [3] "wecare_nfe_genotypes.mx" "wecare_nfe_kgen.df"
## [5] "wecare_nfe_phenotypes.df" "wecare_nfe_variants.df"
dim(wecare_nfe_genotypes.mx)
## [1] 275516 678
class(wecare_nfe_genotypes.mx)
## [1] "matrix"
wecare_nfe_genotypes.mx[1:5,1:5]
## HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000002 0 0 NA NA 0
## Var000000008 0 NA 0 0 0
## Var000000009 0 0 0 0 0
## Var000000012 0 0 NA 0 0
## Var000000013 0 0 NA 0 0
dim(wecare_nfe_phenotypes.df)
## [1] 678 28
str(wecare_nfe_phenotypes.df)
## 'data.frame': 678 obs. of 28 variables:
## $ wes_id : chr "HG00097" "HG00099" "HG00100" "HG00102" ...
## $ gwas_id : chr NA NA NA NA ...
## $ merged_id : chr NA NA NA NA ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ setno : int NA NA NA NA NA NA NA NA NA NA ...
## $ family_history: int NA NA NA NA NA NA NA NA NA NA ...
## $ age_dx : int NA NA NA NA NA NA NA NA NA NA ...
## $ age_ref : num NA NA NA NA NA NA NA NA NA NA ...
## $ rstime : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig1_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig2_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig3_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ stage : num NA NA NA NA NA NA NA NA NA NA ...
## $ er : num NA NA NA NA NA NA NA NA NA NA ...
## $ pr : num NA NA NA NA NA NA NA NA NA NA ...
## $ hist_cat : chr NA NA NA NA ...
## $ hormone : num NA NA NA NA NA NA NA NA NA NA ...
## $ chemo_cat : chr NA NA NA NA ...
## $ br_xray : int NA NA NA NA NA NA NA NA NA NA ...
## $ br_xray_dose : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_menarche : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_1st_ftp : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_menopause : num NA NA NA NA NA NA NA NA NA NA ...
## $ num_preg : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_age18 : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_dx : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_ref : num NA NA NA NA NA NA NA NA NA NA ...
wecare_nfe_phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## HG00097 HG00097 <NA> <NA> pass -1
## HG00099 HG00099 <NA> <NA> pass -1
## HG00100 HG00100 <NA> <NA> pass -1
## HG00102 HG00102 <NA> <NA> pass -1
## HG00106 HG00106 <NA> <NA> pass -1
dim(wecare_nfe_variants.df)
## [1] 275516 23
colnames(wecare_nfe_variants.df)
## [1] "SplitVarID" "SYMBOL" "TYPE"
## [4] "CHROM" "POS" "REF"
## [7] "ALT" "AC" "AF"
## [10] "AN" "Consequence" "SIFT_call"
## [13] "SIFT_score" "PolyPhen_call" "PolyPhen_score"
## [16] "CLIN_SIG" "cDNA_position" "CDS_position"
## [19] "Codons" "Protein_position" "Amino_acids"
## [22] "Existing_variation" "Multiallelic"
wecare_nfe_variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000002 Var000000002 LINC00115 SNP 1 762330
## Var000000008 Var000000008 SAMD11 SNP 1 871215
## Var000000009 Var000000009 SAMD11 SNP 1 871230
## Var000000012 Var000000012 SAMD11 INDEL 1 874778
## Var000000013 Var000000013 SAMD11 SNP 1 878664
dim(wecare_nfe_kgen.df)
## [1] 275516 9
colnames(wecare_nfe_kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
wecare_nfe_kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000002 Var000000002 NA NA NA NA
## Var000000008 Var000000008 NA NA NA NA
## Var000000009 Var000000009 NA NA NA NA
## Var000000012 Var000000012 NA NA NA NA
## Var000000013 Var000000013 2 5008 0.000399361 0
dim(wecare_nfe_exac.df)
## [1] 275516 48
colnames(wecare_nfe_exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
wecare_nfe_exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002 0.0120000 175
## Var000000008 Var000000008 NA NA
## Var000000009 Var000000009 NA NA
## Var000000012 Var000000012 NA NA
## Var000000013 Var000000013 0.0005838 62
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002 14012 59
## Var000000008 NA NA
## Var000000009 NA NA
## Var000000012 NA NA
## Var000000013 106198 17
# Check consistency of colnames and rownames
sum(colnames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_phenotypes.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_variants.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_kgen.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_exac.df))
## [1] 0
Implements procedure described by Price et al 2006 (PMID: 16862161)
normalise_and_calculate_eigenvectors.udf <- function(x) {
# --- Center and normalise variants (rows) --- #
# Center by mean
avg.rows <- apply(x, 1, mean, na.rm=TRUE)
x.c <- x - avg.rows
# Normalise by sqrt(p(1-p)) where p~"posterior estimate of unobserved allele frequency"
# This is motivated by the fact that genetic drift per generation is proportional to this normalisation value (Patterson 2006)
# Also this makes each column to have same variance
#
p.fnc <- function(x) (1 + sum(x, na.rm=TRUE)) / (2 + 2 * sum(!is.na(x)))
p <- apply(x, 1, p.fnc)
eaf <- sqrt(p*(1-p))
x.cn <- x.c/eaf
# Substitute NAs to zeros
0 -> x.cn[is.na(x)]
# --- Calculate eigenvectors of covariance matrix of cases --- #
cov.mx <- cov(x.cn)
eig <- eigen(cov.mx) # eigenvectors in columns
return(eig)
}
# --- Calculate eigenvectors --- #
wecare_nfe_eigen <- normalise_and_calculate_eigenvectors.udf(wecare_nfe_genotypes.mx)
wecare_nfe_eigenvectors.df <- as.data.frame(wecare_nfe_eigen$vectors) # eigenvectors in columns
wecare_nfe_eigenvalues <- wecare_nfe_eigen$values
# --- Prepare data for plotting --- #
# Prepare cases lables
cases_labels <- as.vector(wecare_nfe_phenotypes.df$cc)
"NFE" -> cases_labels[cases_labels==-1]
"UBC" -> cases_labels[cases_labels==0]
"CBC" -> cases_labels[cases_labels==1]
summary(as.factor(cases_labels))
## CBC NFE UBC
## 235 198 245
# Cases IDs (for interactive plot)
cases_IDs <- as.vector(wecare_nfe_phenotypes.df$wes_id)
# Prepare colour scale
colours <- c("NFE" = "BLUE", "UBC" = "PINK", "CBC" = "RED")
userColourScale <- scale_colour_manual(values=colours)
# Data frame to plot
data2plot.df <- cbind(cases_IDs, cases_labels, wecare_nfe_eigenvectors.df[,1:3])
# --- Plot eig1 vs eig2 --- #
g <- ggplot(data2plot.df, aes(-V1,V2)) +
geom_point(aes(colour=cases_labels, text=cases_IDs)) +
labs(title="wecare-nfe<br>all variants (275,516 x 678)", x="-eigenvector1", y="eigenvector2") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #
rm(wecare_nfe_eigenvectors.df, wecare_nfe_eigenvalues, g, data2plot.df)
Using 6 standard deviations in 5 eigenvectors
wecare_nfe_all_variants_eigenvectors.mx <- wecare_nfe_eigen$vectors
ev1 <- wecare_nfe_all_variants_eigenvectors.mx[,1]
ev1.positive_outliers <- ev1 > mean(ev1) + 6 * sd(ev1)
ev1.negative_outliers <- ev1 < mean(ev1) - 6 * sd(ev1)
sum(ev1.positive_outliers)
## [1] 0
sum(ev1.negative_outliers)
## [1] 5
wecare_nfe_phenotypes.df$wes_id[ev1.positive_outliers]
## character(0)
wecare_nfe_phenotypes.df$wes_id[ev1.negative_outliers]
## [1] "P1_C05" "P1_C12" "P1_D08" "P5_E09" "P6_D05"
ev2 <- wecare_nfe_all_variants_eigenvectors.mx[,2]
ev2.positive_outliers <- ev2 > mean(ev2) + 6 * sd(ev2)
ev2.negative_outliers <- ev2 < mean(ev2) - 6 * sd(ev2)
sum(ev2.positive_outliers)
## [1] 1
sum(ev2.negative_outliers)
## [1] 2
wecare_nfe_phenotypes.df$wes_id[ev2.positive_outliers]
## [1] "P5_E09"
wecare_nfe_phenotypes.df$wes_id[ev2.negative_outliers]
## [1] "P1_E06" "P6_D05"
ev3 <- wecare_nfe_all_variants_eigenvectors.mx[,3]
ev3.positive_outliers <- ev3 > mean(ev3) + 6 * sd(ev3)
ev3.negative_outliers <- ev3 < mean(ev3) - 6 * sd(ev3)
sum(ev3.positive_outliers)
## [1] 2
sum(ev3.negative_outliers)
## [1] 1
wecare_nfe_phenotypes.df$wes_id[ev3.positive_outliers]
## [1] "P1_C12" "P1_E06"
wecare_nfe_phenotypes.df$wes_id[ev3.negative_outliers]
## [1] "P6_D05"
ev4 <- wecare_nfe_all_variants_eigenvectors.mx[,4]
ev4.positive_outliers <- ev4 > mean(ev4) + 6 * sd(ev4)
ev4.negative_outliers <- ev4 < mean(ev4) - 6 * sd(ev4)
sum(ev4.positive_outliers)
## [1] 1
sum(ev4.negative_outliers)
## [1] 0
wecare_nfe_phenotypes.df$wes_id[ev4.positive_outliers]
## [1] "P1_E06"
wecare_nfe_phenotypes.df$wes_id[ev4.negative_outliers]
## character(0)
ev5 <- wecare_nfe_all_variants_eigenvectors.mx[,5]
ev5.positive_outliers <- ev5 > mean(ev5) + 6 * sd(ev5)
ev5.negative_outliers <- ev5 < mean(ev5) - 6 * sd(ev5)
sum(ev5.positive_outliers)
## [1] 1
sum(ev5.negative_outliers)
## [1] 1
wecare_nfe_phenotypes.df$wes_id[ev5.positive_outliers]
## [1] "P5_E09"
wecare_nfe_phenotypes.df$wes_id[ev5.negative_outliers]
## [1] "P1_C12"
# Clean-up
rm(wecare_nfe_all_variants_eigenvectors.mx, ev1, ev1.positive_outliers, ev1.negative_outliers,
ev2, ev2.positive_outliers, ev2.negative_outliers, ev3, ev3.positive_outliers, ev3.negative_outliers,
ev4, ev4.positive_outliers, ev4.negative_outliers, ev5, ev5.positive_outliers, ev5.negative_outliers)
# --- Calculate AFs --- #
ac_wecare_nfe_cln <- apply(wecare_nfe_genotypes.mx, 1, sum, na.rm=TRUE)
get_allele_number.udf <- function(x){2*sum(!is.na(x))}
an_wecare_nfe_cln <- apply(wecare_nfe_genotypes.mx, 1, get_allele_number.udf)
af_wecare_nfe_cln <- ac_wecare_nfe_cln/an_wecare_nfe_cln
# Ceck AFs
# (note that uniform variants were excluded)
ac_wecare_nfe_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013
## 5 4 1 44 1
## Var000000020 Var000000021 Var000000022 Var000000023 Var000000024
## 1 4 1 1 837
an_wecare_nfe_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013
## 1088 1128 1106 1114 1168
## Var000000020 Var000000021 Var000000022 Var000000023 Var000000024
## 1240 1248 1246 1240 1288
af_wecare_nfe_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013
## 0.0045955882 0.0035460993 0.0009041591 0.0394973070 0.0008561644
## Var000000020 Var000000021 Var000000022 Var000000023 Var000000024
## 0.0008064516 0.0032051282 0.0008025682 0.0008064516 0.6498447205
min(ac_wecare_nfe_cln)
## [1] 1
min(an_wecare_nfe_cln)
## [1] 1072
min(af_wecare_nfe_cln)
## [1] 0.0007374631
max(ac_wecare_nfe_cln)
## [1] 1355
max(an_wecare_nfe_cln)
## [1] 1356
max(af_wecare_nfe_cln)
## [1] 0.9992625
# Add updated AFs to wecare_nfe_variants.df
wecare_nfe_variants.df <- cbind(wecare_nfe_variants.df,
ac_wecare_nfe_cln, an_wecare_nfe_cln, af_wecare_nfe_cln)
# --- Exclude rare variants --- #
# Note exclusion on both sides: high- and low- AFs
# Low AFs remove rare variants with common allele in reference genome
# Hight AFs remove rare variants with common allele in reference genome
wecare_nfe_common_vars <- af_wecare_nfe_cln > 0.05 & af_wecare_nfe_cln < 0.95
sum(wecare_nfe_common_vars) # 44,919
## [1] 44919
min(af_wecare_nfe_cln[wecare_nfe_common_vars])
## [1] 0.05007474
max(af_wecare_nfe_cln[wecare_nfe_common_vars])
## [1] 0.9499253
common_wecare_nfe_genotypes.mx <- wecare_nfe_genotypes.mx[wecare_nfe_common_vars,]
common_wecare_nfe_variants.df <- wecare_nfe_variants.df[wecare_nfe_common_vars,]
common_wecare_nfe_kgen.df <- wecare_nfe_kgen.df[wecare_nfe_common_vars,]
common_wecare_nfe_exac.df <- wecare_nfe_exac.df[wecare_nfe_common_vars,]
# --- Calculate eigenvectors --- #
common_wecare_nfe_eigen <- normalise_and_calculate_eigenvectors.udf(common_wecare_nfe_genotypes.mx)
common_wecare_nfe_eigenvectors.df <- as.data.frame(common_wecare_nfe_eigen$vectors) # eigenvectors in columns
common_wecare_nfe_eigenvalues <- common_wecare_nfe_eigen$values
# --- Prepare data for plotting --- #
# Data frame to plot
data2plot.df <- cbind(cases_IDs, cases_labels, common_wecare_nfe_eigenvectors.df[,1:3])
# --- Plot eig1 vs eig2 --- #
g <- ggplot(data2plot.df, aes(V1,V2)) +
geom_point(aes(colour=cases_labels, text=cases_IDs)) +
labs(title="wecare-nfe<br>common variants (44,919 x 678)", x="eigenvector1", y="eigenvector2") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #
rm(common_wecare_nfe_eigenvectors.df, colours, userColourScale, cases_IDs, cases_labels,
common_wecare_nfe_eigenvalues, g, normalise_and_calculate_eigenvectors.udf, data2plot.df,
get_allele_number.udf, ac_wecare_nfe_cln, an_wecare_nfe_cln, af_wecare_nfe_cln,
wecare_nfe_common_vars)
# --- Compare eigenvalues --- #
eval_all <- wecare_nfe_eigen$values
eval_common <- common_wecare_nfe_eigen$values
plot(eval_all, main="Wecare-nfe eigenvalues (all variants)")
plot(eval_common, main="Wecare-nfe eigenvalues (common variants)")
plot(eval_all,eval_common, main="Wecare-nfe eigenvalues (all vs common variants)")
# --- Compare eigenvectors --- #
# Gather data
ev1_all <- wecare_nfe_eigen$vectors[,1]
ev1_common <- common_wecare_nfe_eigen$vectors[,1]
ev2_all <- wecare_nfe_eigen$vectors[,2]
ev2_common <- common_wecare_nfe_eigen$vectors[,2]
ev3_all <- wecare_nfe_eigen$vectors[,3]
ev3_common <- common_wecare_nfe_eigen$vectors[,3]
data2plot.df <- as.data.frame(cbind(ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common))
# Calculate correlations
cor.test(ev1_all, ev1_common) # -0.1386195, p-value = 0.0002943
##
## Pearson's product-moment correlation
##
## data: ev1_all and ev1_common
## t = -3.6392, df = 676, p-value = 0.0002943
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.21170610 -0.06399117
## sample estimates:
## cor
## -0.1386195
cor.test(ev2_all, ev2_common) # -0.2703052, p-value = 8.125e-13
##
## Pearson's product-moment correlation
##
## data: ev2_all and ev2_common
## t = -7.2997, df = 676, p-value = 8.125e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3387078 -0.1990604
## sample estimates:
## cor
## -0.2703052
cor.test(ev3_all, ev3_common) # 0.04723419, p-value = 0.2193
##
## Pearson's product-moment correlation
##
## data: ev3_all and ev3_common
## t = 1.2295, df = 676, p-value = 0.2193
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02816223 0.12209621
## sample estimates:
## cor
## 0.04723419
# Common sence check (these eigenvectors should be orthogonal...)
cor.test(ev1_all, ev2_all) # -3.530534e-16, p-value = 1
##
## Pearson's product-moment correlation
##
## data: ev1_all and ev2_all
## t = -9.1794e-15, df = 676, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07529626 0.07529626
## sample estimates:
## cor
## -3.530534e-16
cor.test(ev1_common, ev2_common) # -6.391521e-16, p-value = 1
##
## Pearson's product-moment correlation
##
## data: ev1_common and ev2_common
## t = -1.6618e-14, df = 676, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07529626 0.07529626
## sample estimates:
## cor
## -6.391521e-16
# Make plots
g <- ggplot(data2plot.df, aes(ev1_all, ev1_common)) +
geom_point() +
labs(title="Wecare-nfe eigenvector 1<br>all vs common variants (p=0.0003)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev2_all, ev2_common)) +
geom_point() +
labs(title="Wecare-nfe eigenvector 2<br>all vs common variants (p=8e-13)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev3_all, ev3_common)) +
geom_point() +
labs(title="Wecare-nfe eigenvector 3<br>all vs common variants (p-value = 0.2193)")
ggplotly(g)
# Clean-up
rm(eval_all, eval_common, ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common, g, data2plot.df)
Omitted
ls()
## [1] "common_wecare_nfe_eigen" "common_wecare_nfe_exac.df"
## [3] "common_wecare_nfe_genotypes.mx" "common_wecare_nfe_kgen.df"
## [5] "common_wecare_nfe_variants.df" "interim_data_folder"
## [7] "wecare_nfe_eigen" "wecare_nfe_exac.df"
## [9] "wecare_nfe_genotypes.mx" "wecare_nfe_kgen.df"
## [11] "wecare_nfe_phenotypes.df" "wecare_nfe_variants.df"
# wecare nfe all vars
dim(wecare_nfe_genotypes.mx)
## [1] 275516 678
class(wecare_nfe_genotypes.mx)
## [1] "matrix"
wecare_nfe_genotypes.mx[1:5,1:5]
## HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000002 0 0 NA NA 0
## Var000000008 0 NA 0 0 0
## Var000000009 0 0 0 0 0
## Var000000012 0 0 NA 0 0
## Var000000013 0 0 NA 0 0
dim(wecare_nfe_phenotypes.df)
## [1] 678 28
str(wecare_nfe_phenotypes.df)
## 'data.frame': 678 obs. of 28 variables:
## $ wes_id : chr "HG00097" "HG00099" "HG00100" "HG00102" ...
## $ gwas_id : chr NA NA NA NA ...
## $ merged_id : chr NA NA NA NA ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ setno : int NA NA NA NA NA NA NA NA NA NA ...
## $ family_history: int NA NA NA NA NA NA NA NA NA NA ...
## $ age_dx : int NA NA NA NA NA NA NA NA NA NA ...
## $ age_ref : num NA NA NA NA NA NA NA NA NA NA ...
## $ rstime : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig1_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig2_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig3_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ stage : num NA NA NA NA NA NA NA NA NA NA ...
## $ er : num NA NA NA NA NA NA NA NA NA NA ...
## $ pr : num NA NA NA NA NA NA NA NA NA NA ...
## $ hist_cat : chr NA NA NA NA ...
## $ hormone : num NA NA NA NA NA NA NA NA NA NA ...
## $ chemo_cat : chr NA NA NA NA ...
## $ br_xray : int NA NA NA NA NA NA NA NA NA NA ...
## $ br_xray_dose : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_menarche : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_1st_ftp : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_menopause : num NA NA NA NA NA NA NA NA NA NA ...
## $ num_preg : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_age18 : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_dx : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_ref : num NA NA NA NA NA NA NA NA NA NA ...
wecare_nfe_phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## HG00097 HG00097 <NA> <NA> pass -1
## HG00099 HG00099 <NA> <NA> pass -1
## HG00100 HG00100 <NA> <NA> pass -1
## HG00102 HG00102 <NA> <NA> pass -1
## HG00106 HG00106 <NA> <NA> pass -1
dim(wecare_nfe_variants.df)
## [1] 275516 26
colnames(wecare_nfe_variants.df)
## [1] "SplitVarID" "SYMBOL" "TYPE"
## [4] "CHROM" "POS" "REF"
## [7] "ALT" "AC" "AF"
## [10] "AN" "Consequence" "SIFT_call"
## [13] "SIFT_score" "PolyPhen_call" "PolyPhen_score"
## [16] "CLIN_SIG" "cDNA_position" "CDS_position"
## [19] "Codons" "Protein_position" "Amino_acids"
## [22] "Existing_variation" "Multiallelic" "ac_wecare_nfe_cln"
## [25] "an_wecare_nfe_cln" "af_wecare_nfe_cln"
wecare_nfe_variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000002 Var000000002 LINC00115 SNP 1 762330
## Var000000008 Var000000008 SAMD11 SNP 1 871215
## Var000000009 Var000000009 SAMD11 SNP 1 871230
## Var000000012 Var000000012 SAMD11 INDEL 1 874778
## Var000000013 Var000000013 SAMD11 SNP 1 878664
dim(wecare_nfe_kgen.df)
## [1] 275516 9
colnames(wecare_nfe_kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
wecare_nfe_kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000002 Var000000002 NA NA NA NA
## Var000000008 Var000000008 NA NA NA NA
## Var000000009 Var000000009 NA NA NA NA
## Var000000012 Var000000012 NA NA NA NA
## Var000000013 Var000000013 2 5008 0.000399361 0
dim(wecare_nfe_exac.df)
## [1] 275516 48
colnames(wecare_nfe_exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
wecare_nfe_exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002 0.0120000 175
## Var000000008 Var000000008 NA NA
## Var000000009 Var000000009 NA NA
## Var000000012 Var000000012 NA NA
## Var000000013 Var000000013 0.0005838 62
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002 14012 59
## Var000000008 NA NA
## Var000000009 NA NA
## Var000000012 NA NA
## Var000000013 106198 17
sum(colnames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_phenotypes.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_variants.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_kgen.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_exac.df))
## [1] 0
# wecare nfe common vars
dim(common_wecare_nfe_genotypes.mx)
## [1] 44919 678
class(common_wecare_nfe_genotypes.mx)
## [1] "matrix"
common_wecare_nfe_genotypes.mx[1:5,1:5]
## HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000024 0 2 2 2 1
## Var000000027 0 0 0 0 0
## Var000000037 0 0 0 0 0
## Var000000054 2 2 2 2 2
## Var000000058 2 2 NA 2 NA
dim(common_wecare_nfe_variants.df)
## [1] 44919 26
colnames(common_wecare_nfe_variants.df)
## [1] "SplitVarID" "SYMBOL" "TYPE"
## [4] "CHROM" "POS" "REF"
## [7] "ALT" "AC" "AF"
## [10] "AN" "Consequence" "SIFT_call"
## [13] "SIFT_score" "PolyPhen_call" "PolyPhen_score"
## [16] "CLIN_SIG" "cDNA_position" "CDS_position"
## [19] "Codons" "Protein_position" "Amino_acids"
## [22] "Existing_variation" "Multiallelic" "ac_wecare_nfe_cln"
## [25] "an_wecare_nfe_cln" "af_wecare_nfe_cln"
common_wecare_nfe_variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000024 Var000000024 NOC2L SNP 1 881627
## Var000000027 Var000000027 NOC2L SNP 1 881918
## Var000000037 Var000000037 NOC2L SNP 1 889238
## Var000000054 Var000000054 KLHL17 SNP 1 897325
## Var000000058 Var000000058 KLHL17 SNP 1 897564
dim(common_wecare_nfe_kgen.df)
## [1] 44919 9
colnames(common_wecare_nfe_kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
common_wecare_nfe_kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000024 Var000000024 2213 5008 0.4418930 0.0643
## Var000000027 Var000000027 147 5008 0.0293530 0.0038
## Var000000037 Var000000037 283 5008 0.0565096 0.0197
## Var000000054 Var000000054 4303 5008 0.8592250 0.6959
## Var000000058 Var000000058 4507 5008 0.8999600 0.8298
dim(common_wecare_nfe_exac.df)
## [1] 44919 48
colnames(common_wecare_nfe_exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
common_wecare_nfe_exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000024 Var000000024 0.562 59637
## Var000000027 Var000000027 0.045 4735
## Var000000037 Var000000037 NA NA
## Var000000054 Var000000054 NA NA
## Var000000058 Var000000058 0.898 70551
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000024 106210 24874
## Var000000027 106208 1867
## Var000000037 NA NA
## Var000000054 NA NA
## Var000000058 78540 4650
sum(colnames(common_wecare_nfe_genotypes.mx) != rownames(wecare_nfe_phenotypes.df))
## [1] 0
sum(rownames(common_wecare_nfe_genotypes.mx) != rownames(common_wecare_nfe_variants.df))
## [1] 0
sum(rownames(common_wecare_nfe_genotypes.mx) != rownames(common_wecare_nfe_kgen.df))
## [1] 0
sum(rownames(common_wecare_nfe_genotypes.mx) != rownames(common_wecare_nfe_exac.df))
## [1] 0
save.image(paste(interim_data_folder, "r05b_calculate_egenvectors_wecare_nfe_jan2017.RData", sep="/"))
ls()
## [1] "common_wecare_nfe_eigen" "common_wecare_nfe_exac.df"
## [3] "common_wecare_nfe_genotypes.mx" "common_wecare_nfe_kgen.df"
## [5] "common_wecare_nfe_variants.df" "interim_data_folder"
## [7] "wecare_nfe_eigen" "wecare_nfe_exac.df"
## [9] "wecare_nfe_genotypes.mx" "wecare_nfe_kgen.df"
## [11] "wecare_nfe_phenotypes.df" "wecare_nfe_variants.df"
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.8 (Carbon)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.5.6 ggplot2_2.2.1 dplyr_0.5.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.6 knitr_1.13 magrittr_1.5
## [4] munsell_0.4.3 viridisLite_0.1.3 colorspace_1.2-6
## [7] R6_2.1.2 httr_1.2.1 stringr_1.0.0
## [10] plyr_1.8.4 tools_3.2.3 grid_3.2.3
## [13] gtable_0.2.0 DBI_0.5 htmltools_0.3.5
## [16] yaml_2.1.13 lazyeval_0.2.0 assertthat_0.1
## [19] digest_0.6.10 tibble_1.1 tidyr_0.5.1
## [22] purrr_0.2.2 formatR_1.4 base64enc_0.1-3
## [25] htmlwidgets_0.8 evaluate_0.9 rmarkdown_1.0
## [28] labeling_0.3 stringi_1.1.1 scales_0.4.1
## [31] jsonlite_1.0
Sys.time()
## [1] "2017-02-21 10:20:10 GMT"